{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 半监督词分类算法\n",
    "https://blog.csdn.net/Yellow_python/article/details/100940617"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from jieba import cut\n",
    "from re import fullmatch\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from collections import Counter"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "读语料"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('x_train.txt', encoding='utf-8') as f:\n",
    "    x_train = f.read().strip().split(\"\\n\")\n",
    "with open('y_train.txt', encoding='utf-8') as f:\n",
    "    y_train = f.read().strip().split(\"\\n\")\n",
    "with open('x_test.txt', encoding='utf-8') as f:\n",
    "    x_test = f.read().strip().split(\"\\n\")\n",
    "with open('y_test.txt', encoding='utf-8') as f:\n",
    "    y_test = f.read().strip().split(\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "分词器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "match = lambda word: fullmatch('[a-zA-Z\\u4e00-\\u9fa5]+', word)\n",
    "stopwords = set('的了很买是都还也我就在这那又里哦和')\n",
    "tokenizer = lambda text: (word for word in cut(text, HMM=False) if match(word) and word not in stopwords)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "向量化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache C:\\Users\\HONGJI~1\\AppData\\Local\\Temp\\jieba.cache\n",
      "Loading model cost 0.764 seconds.\n",
      "Prefix dict has been built succesfully.\n"
     ]
    }
   ],
   "source": [
    "vectorizer = TfidfVectorizer(tokenizer=tokenizer)\n",
    "X_train = vectorizer.fit_transform(x_train)\n",
    "X_test = vectorizer.transform(x_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "分类模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.9138333333333334\n"
     ]
    }
   ],
   "source": [
    "clf = MultinomialNB()\n",
    "clf.fit(X_train, y_train)\n",
    "print(clf.score(X_test, y_test))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "可视化函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "yellow = lambda w: '\\033[033m{}\\033[0m'.format(w)\n",
    "hot = lambda i: '\\033[034m%.2f\\033[0m' % i if i < .5 else '\\033[031m%.2f\\033[0m' % i"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 全监督词分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "苹果 \u001b[033m1789\u001b[0m \u001b[031m0.61\u001b[0m\n",
      "好 \u001b[033m1555\u001b[0m \u001b[031m0.81\u001b[0m\n",
      "不 \u001b[033m1424\u001b[0m \u001b[034m0.25\u001b[0m\n",
      "京东 \u001b[033m1088\u001b[0m \u001b[031m0.63\u001b[0m\n",
      "吃 \u001b[033m867\u001b[0m \u001b[031m0.66\u001b[0m\n",
      "有 \u001b[033m769\u001b[0m \u001b[034m0.39\u001b[0m\n",
      "不错 \u001b[033m719\u001b[0m \u001b[031m0.94\u001b[0m\n",
      "小 \u001b[033m623\u001b[0m \u001b[034m0.31\u001b[0m\n",
      "新鲜 \u001b[033m595\u001b[0m \u001b[031m0.60\u001b[0m\n",
      "水果 \u001b[033m549\u001b[0m \u001b[034m0.42\u001b[0m\n",
      "一个 \u001b[033m546\u001b[0m \u001b[034m0.42\u001b[0m\n",
      "好吃 \u001b[033m540\u001b[0m \u001b[031m0.75\u001b[0m\n",
      "个 \u001b[033m509\u001b[0m \u001b[034m0.34\u001b[0m\n",
      "差 \u001b[033m504\u001b[0m \u001b[034m0.06\u001b[0m\n",
      "甜 \u001b[033m476\u001b[0m \u001b[031m0.72\u001b[0m\n",
      "脆 \u001b[033m476\u001b[0m \u001b[031m0.90\u001b[0m\n",
      "没有 \u001b[033m473\u001b[0m \u001b[031m0.54\u001b[0m\n",
      "没 \u001b[033m454\u001b[0m \u001b[034m0.46\u001b[0m\n",
      "烂 \u001b[033m450\u001b[0m \u001b[034m0.12\u001b[0m\n",
      "坏 \u001b[033m442\u001b[0m \u001b[034m0.34\u001b[0m\n",
      "给 \u001b[033m442\u001b[0m \u001b[034m0.41\u001b[0m\n",
      "太 \u001b[033m439\u001b[0m \u001b[034m0.23\u001b[0m\n",
      "大 \u001b[033m435\u001b[0m \u001b[031m0.60\u001b[0m\n",
      "味道 \u001b[033m402\u001b[0m \u001b[031m0.70\u001b[0m\n",
      "非常 \u001b[033m343\u001b[0m \u001b[031m0.70\u001b[0m\n",
      "包装 \u001b[033m341\u001b[0m \u001b[031m0.83\u001b[0m\n",
      "快递 \u001b[033m338\u001b[0m \u001b[031m0.71\u001b[0m\n",
      "就是 \u001b[033m331\u001b[0m \u001b[031m0.63\u001b[0m\n",
      "到 \u001b[033m320\u001b[0m \u001b[031m0.54\u001b[0m\n",
      "说 \u001b[033m319\u001b[0m \u001b[034m0.29\u001b[0m\n",
      "个头 \u001b[033m311\u001b[0m \u001b[031m0.67\u001b[0m\n",
      "快 \u001b[033m306\u001b[0m \u001b[031m0.89\u001b[0m\n",
      "这个 \u001b[033m300\u001b[0m \u001b[031m0.51\u001b[0m\n",
      "可以 \u001b[033m298\u001b[0m \u001b[031m0.75\u001b[0m\n",
      "还是 \u001b[033m279\u001b[0m \u001b[034m0.49\u001b[0m\n",
      "东西 \u001b[033m272\u001b[0m \u001b[031m0.54\u001b[0m\n",
      "多 \u001b[033m262\u001b[0m \u001b[031m0.53\u001b[0m\n",
      "挺 \u001b[033m257\u001b[0m \u001b[031m0.86\u001b[0m\n",
      "再 \u001b[033m250\u001b[0m \u001b[031m0.54\u001b[0m\n",
      "物流 \u001b[033m247\u001b[0m \u001b[031m0.80\u001b[0m\n",
      "果 \u001b[033m243\u001b[0m \u001b[034m0.46\u001b[0m\n",
      "购买 \u001b[033m234\u001b[0m \u001b[031m0.70\u001b[0m\n",
      "火龙果 \u001b[033m232\u001b[0m \u001b[034m0.04\u001b[0m\n",
      "吧 \u001b[033m229\u001b[0m \u001b[034m0.47\u001b[0m\n",
      "收到 \u001b[033m224\u001b[0m \u001b[034m0.49\u001b[0m\n",
      "第一次 \u001b[033m218\u001b[0m \u001b[034m0.39\u001b[0m\n",
      "这次 \u001b[033m217\u001b[0m \u001b[034m0.45\u001b[0m\n",
      "失望 \u001b[033m213\u001b[0m \u001b[034m0.08\u001b[0m\n",
      "比 \u001b[033m212\u001b[0m \u001b[031m0.58\u001b[0m\n",
      "满意 \u001b[033m207\u001b[0m \u001b[031m0.65\u001b[0m\n",
      "上 \u001b[033m206\u001b[0m \u001b[034m0.47\u001b[0m\n",
      "不是 \u001b[033m203\u001b[0m \u001b[034m0.50\u001b[0m\n",
      "一次 \u001b[033m198\u001b[0m \u001b[034m0.32\u001b[0m\n",
      "口感 \u001b[033m196\u001b[0m \u001b[031m0.72\u001b[0m\n",
      "而且 \u001b[033m194\u001b[0m \u001b[034m0.28\u001b[0m\n",
      "以后 \u001b[033m184\u001b[0m \u001b[034m0.38\u001b[0m\n",
      "酸 \u001b[033m181\u001b[0m \u001b[034m0.26\u001b[0m\n",
      "超市 \u001b[033m181\u001b[0m \u001b[031m0.57\u001b[0m\n",
      "来 \u001b[033m176\u001b[0m \u001b[031m0.63\u001b[0m\n",
      "过 \u001b[033m174\u001b[0m \u001b[034m0.46\u001b[0m\n",
      "知道 \u001b[033m173\u001b[0m \u001b[034m0.38\u001b[0m\n",
      "感觉 \u001b[033m172\u001b[0m \u001b[031m0.53\u001b[0m\n",
      "有点 \u001b[033m172\u001b[0m \u001b[031m0.63\u001b[0m\n",
      "速度 \u001b[033m172\u001b[0m \u001b[031m0.88\u001b[0m\n",
      "送货 \u001b[033m170\u001b[0m \u001b[031m0.89\u001b[0m\n",
      "一点 \u001b[033m167\u001b[0m \u001b[034m0.24\u001b[0m\n",
      "啊 \u001b[033m166\u001b[0m \u001b[034m0.35\u001b[0m\n",
      "但是 \u001b[033m165\u001b[0m \u001b[031m0.66\u001b[0m\n",
      "不会 \u001b[033m164\u001b[0m \u001b[034m0.15\u001b[0m\n",
      "这样 \u001b[033m163\u001b[0m \u001b[034m0.18\u001b[0m\n",
      "大小 \u001b[033m161\u001b[0m \u001b[031m0.74\u001b[0m\n",
      "会 \u001b[033m158\u001b[0m \u001b[031m0.83\u001b[0m\n",
      "还有 \u001b[033m158\u001b[0m \u001b[034m0.19\u001b[0m\n",
      "真的 \u001b[033m157\u001b[0m \u001b[034m0.40\u001b[0m\n",
      "比较 \u001b[033m157\u001b[0m \u001b[031m0.82\u001b[0m\n",
      "评 \u001b[033m156\u001b[0m \u001b[034m0.12\u001b[0m\n",
      "购物 \u001b[033m155\u001b[0m \u001b[034m0.40\u001b[0m\n",
      "两个 \u001b[033m155\u001b[0m \u001b[034m0.20\u001b[0m\n",
      "喜欢 \u001b[033m155\u001b[0m \u001b[031m0.93\u001b[0m\n",
      "下次 \u001b[033m153\u001b[0m \u001b[031m0.67\u001b[0m\n",
      "价格 \u001b[033m151\u001b[0m \u001b[031m0.74\u001b[0m\n",
      "特别 \u001b[033m148\u001b[0m \u001b[034m0.49\u001b[0m\n",
      "要 \u001b[033m148\u001b[0m \u001b[034m0.43\u001b[0m\n",
      "不好 \u001b[033m147\u001b[0m \u001b[034m0.15\u001b[0m\n",
      "人 \u001b[033m147\u001b[0m \u001b[034m0.41\u001b[0m\n",
      "这么 \u001b[033m147\u001b[0m \u001b[034m0.19\u001b[0m\n",
      "很甜 \u001b[033m146\u001b[0m \u001b[031m0.94\u001b[0m\n",
      "看 \u001b[033m140\u001b[0m \u001b[034m0.37\u001b[0m\n",
      "点 \u001b[033m139\u001b[0m \u001b[031m0.71\u001b[0m\n",
      "你 \u001b[033m138\u001b[0m \u001b[034m0.26\u001b[0m\n",
      "赞 \u001b[033m138\u001b[0m \u001b[031m0.95\u001b[0m\n",
      "货 \u001b[033m135\u001b[0m \u001b[034m0.30\u001b[0m\n",
      "什么 \u001b[033m134\u001b[0m \u001b[034m0.22\u001b[0m\n",
      "一 \u001b[033m134\u001b[0m \u001b[034m0.31\u001b[0m\n",
      "质量 \u001b[033m134\u001b[0m \u001b[031m0.59\u001b[0m\n",
      "吗 \u001b[033m133\u001b[0m \u001b[034m0.13\u001b[0m\n",
      "但 \u001b[033m132\u001b[0m \u001b[031m0.66\u001b[0m\n",
      "已经 \u001b[033m130\u001b[0m \u001b[031m0.53\u001b[0m\n",
      "能 \u001b[033m128\u001b[0m \u001b[034m0.40\u001b[0m\n",
      "三个 \u001b[033m128\u001b[0m \u001b[034m0.07\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "words = Counter(word for text in x_train for word in tokenizer(text))\n",
    "for word, freq in words.most_common(100):\n",
    "    pred = clf.predict_proba(vectorizer.transform([word]))[0][1]\n",
    "    print(word, yellow(freq), hot(pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 半监督词分类"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "差评 \u001b[033m216\u001b[0m \u001b[034m0.08\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "很脆 \u001b[033m113\u001b[0m \u001b[031m0.89\u001b[0m \u001b[031m0.96\u001b[0m\n",
      "还会 \u001b[033m106\u001b[0m \u001b[031m0.90\u001b[0m \u001b[031m0.98\u001b[0m\n",
      "不甜 \u001b[033m96\u001b[0m \u001b[034m0.29\u001b[0m \u001b[034m0.15\u001b[0m\n",
      "脆甜 \u001b[033m96\u001b[0m \u001b[031m0.88\u001b[0m \u001b[031m0.95\u001b[0m\n",
      "这是 \u001b[033m81\u001b[0m \u001b[034m0.28\u001b[0m \u001b[034m0.19\u001b[0m\n",
      "太差 \u001b[033m78\u001b[0m \u001b[034m0.12\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "买过 \u001b[033m64\u001b[0m \u001b[031m0.58\u001b[0m \u001b[031m0.58\u001b[0m\n",
      "不买 \u001b[033m64\u001b[0m \u001b[034m0.16\u001b[0m \u001b[034m0.03\u001b[0m\n",
      "全是 \u001b[033m60\u001b[0m \u001b[034m0.15\u001b[0m \u001b[034m0.03\u001b[0m\n",
      "不脆 \u001b[033m60\u001b[0m \u001b[034m0.33\u001b[0m \u001b[034m0.23\u001b[0m\n",
      "一看 \u001b[033m56\u001b[0m \u001b[034m0.20\u001b[0m \u001b[034m0.11\u001b[0m\n",
      "很差 \u001b[033m55\u001b[0m \u001b[034m0.16\u001b[0m \u001b[034m0.02\u001b[0m\n",
      "给力 \u001b[033m55\u001b[0m \u001b[031m0.89\u001b[0m \u001b[031m0.96\u001b[0m\n",
      "脆脆 \u001b[033m54\u001b[0m \u001b[031m0.90\u001b[0m \u001b[031m0.98\u001b[0m\n",
      "坏果 \u001b[033m51\u001b[0m \u001b[031m0.53\u001b[0m \u001b[031m0.55\u001b[0m\n",
      "一星 \u001b[033m48\u001b[0m \u001b[034m0.14\u001b[0m \u001b[034m0.02\u001b[0m\n",
      "网购 \u001b[033m47\u001b[0m \u001b[034m0.30\u001b[0m \u001b[034m0.23\u001b[0m\n",
      "好用 \u001b[033m47\u001b[0m \u001b[031m0.53\u001b[0m \u001b[031m0.98\u001b[0m\n",
      "还行 \u001b[033m40\u001b[0m \u001b[031m0.70\u001b[0m \u001b[031m0.82\u001b[0m\n",
      "甜脆 \u001b[033m40\u001b[0m \u001b[031m0.89\u001b[0m \u001b[031m0.95\u001b[0m\n",
      "中果 \u001b[033m36\u001b[0m \u001b[034m0.16\u001b[0m \u001b[034m0.06\u001b[0m\n",
      "无语 \u001b[033m32\u001b[0m \u001b[034m0.10\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "大果 \u001b[033m31\u001b[0m \u001b[034m0.16\u001b[0m \u001b[034m0.06\u001b[0m\n",
      "送人 \u001b[033m29\u001b[0m \u001b[034m0.41\u001b[0m \u001b[034m0.34\u001b[0m\n",
      "超快 \u001b[033m29\u001b[0m \u001b[031m0.92\u001b[0m \u001b[031m0.97\u001b[0m\n",
      "冷链 \u001b[033m28\u001b[0m \u001b[031m0.78\u001b[0m \u001b[031m0.86\u001b[0m\n",
      "不大 \u001b[033m28\u001b[0m \u001b[031m0.55\u001b[0m \u001b[031m0.50\u001b[0m\n",
      "点赞 \u001b[033m28\u001b[0m \u001b[031m0.90\u001b[0m \u001b[031m1.00\u001b[0m\n",
      "有坏 \u001b[033m27\u001b[0m \u001b[034m0.43\u001b[0m \u001b[034m0.37\u001b[0m\n",
      "差差 \u001b[033m26\u001b[0m \u001b[034m0.05\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "很足 \u001b[033m26\u001b[0m \u001b[031m0.90\u001b[0m \u001b[031m1.00\u001b[0m\n",
      "个装 \u001b[033m24\u001b[0m \u001b[031m0.61\u001b[0m \u001b[031m0.62\u001b[0m\n",
      "华圣 \u001b[033m23\u001b[0m \u001b[031m0.75\u001b[0m \u001b[031m0.87\u001b[0m\n",
      "看图 \u001b[033m22\u001b[0m \u001b[034m0.22\u001b[0m \u001b[034m0.23\u001b[0m\n",
      "小个 \u001b[033m22\u001b[0m \u001b[034m0.41\u001b[0m \u001b[034m0.32\u001b[0m\n",
      "买到 \u001b[033m21\u001b[0m \u001b[034m0.27\u001b[0m \u001b[034m0.19\u001b[0m\n",
      "不太 \u001b[033m21\u001b[0m \u001b[034m0.41\u001b[0m \u001b[034m0.33\u001b[0m\n",
      "太大 \u001b[033m21\u001b[0m \u001b[034m0.42\u001b[0m \u001b[034m0.29\u001b[0m\n",
      "有个 \u001b[033m20\u001b[0m \u001b[034m0.49\u001b[0m \u001b[031m0.50\u001b[0m\n",
      "挺脆 \u001b[033m20\u001b[0m \u001b[031m0.90\u001b[0m \u001b[031m1.00\u001b[0m\n",
      "偏小 \u001b[033m19\u001b[0m \u001b[031m0.57\u001b[0m \u001b[031m0.58\u001b[0m\n",
      "没好 \u001b[033m19\u001b[0m \u001b[034m0.26\u001b[0m \u001b[034m0.11\u001b[0m\n",
      "赞赞 \u001b[033m19\u001b[0m \u001b[031m0.95\u001b[0m \u001b[031m1.00\u001b[0m\n",
      "总重 \u001b[033m18\u001b[0m \u001b[031m0.57\u001b[0m \u001b[031m0.67\u001b[0m\n",
      "很赞 \u001b[033m18\u001b[0m \u001b[031m0.88\u001b[0m \u001b[031m1.00\u001b[0m\n",
      "有烂 \u001b[033m18\u001b[0m \u001b[034m0.32\u001b[0m \u001b[034m0.11\u001b[0m\n",
      "有图 \u001b[033m17\u001b[0m \u001b[034m0.22\u001b[0m \u001b[034m0.18\u001b[0m\n",
      "没坏 \u001b[033m17\u001b[0m \u001b[031m0.51\u001b[0m \u001b[034m0.47\u001b[0m\n",
      "太坑 \u001b[033m16\u001b[0m \u001b[034m0.10\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "宏辉 \u001b[033m16\u001b[0m \u001b[031m0.60\u001b[0m \u001b[031m0.81\u001b[0m\n",
      "小果 \u001b[033m15\u001b[0m \u001b[034m0.16\u001b[0m \u001b[034m0.07\u001b[0m\n",
      "不说 \u001b[033m15\u001b[0m \u001b[034m0.13\u001b[0m \u001b[034m0.07\u001b[0m\n",
      "太酸 \u001b[033m15\u001b[0m \u001b[034m0.24\u001b[0m \u001b[034m0.13\u001b[0m\n",
      "不像 \u001b[033m14\u001b[0m \u001b[034m0.47\u001b[0m \u001b[031m0.50\u001b[0m\n",
      "佳农 \u001b[033m14\u001b[0m \u001b[034m0.29\u001b[0m \u001b[034m0.29\u001b[0m\n",
      "烂果 \u001b[033m14\u001b[0m \u001b[034m0.48\u001b[0m \u001b[034m0.43\u001b[0m\n",
      "没见 \u001b[033m13\u001b[0m \u001b[034m0.11\u001b[0m \u001b[034m0.08\u001b[0m\n",
      "kg \u001b[033m13\u001b[0m \u001b[034m0.46\u001b[0m \u001b[031m0.54\u001b[0m\n",
      "果果 \u001b[033m13\u001b[0m \u001b[031m0.78\u001b[0m \u001b[031m0.92\u001b[0m\n",
      "小贵 \u001b[033m13\u001b[0m \u001b[031m0.77\u001b[0m \u001b[031m0.92\u001b[0m\n",
      "很香 \u001b[033m13\u001b[0m \u001b[031m0.92\u001b[0m \u001b[031m1.00\u001b[0m\n",
      "个果 \u001b[033m12\u001b[0m \u001b[034m0.33\u001b[0m \u001b[034m0.25\u001b[0m\n",
      "很酸 \u001b[033m12\u001b[0m \u001b[034m0.29\u001b[0m \u001b[034m0.17\u001b[0m\n",
      "遍 \u001b[033m12\u001b[0m \u001b[034m0.25\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "晚到 \u001b[033m12\u001b[0m \u001b[034m0.27\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "我要 \u001b[033m12\u001b[0m \u001b[034m0.24\u001b[0m \u001b[034m0.17\u001b[0m\n",
      "挺大 \u001b[033m12\u001b[0m \u001b[031m0.83\u001b[0m \u001b[031m0.92\u001b[0m\n",
      "带点 \u001b[033m12\u001b[0m \u001b[031m0.78\u001b[0m \u001b[031m0.83\u001b[0m\n",
      "之选 \u001b[033m12\u001b[0m \u001b[031m0.89\u001b[0m \u001b[031m1.00\u001b[0m\n",
      "小不说 \u001b[033m11\u001b[0m \u001b[034m0.14\u001b[0m \u001b[034m0.09\u001b[0m\n",
      "整箱 \u001b[033m11\u001b[0m \u001b[034m0.27\u001b[0m \u001b[034m0.18\u001b[0m\n",
      "长得 \u001b[033m11\u001b[0m \u001b[034m0.14\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "没熟 \u001b[033m11\u001b[0m \u001b[034m0.27\u001b[0m \u001b[034m0.09\u001b[0m\n",
      "别买 \u001b[033m11\u001b[0m \u001b[034m0.10\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "带皮 \u001b[033m11\u001b[0m \u001b[031m0.76\u001b[0m \u001b[031m0.91\u001b[0m\n",
      "还贵 \u001b[033m11\u001b[0m \u001b[034m0.20\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "点个 \u001b[033m11\u001b[0m \u001b[031m0.82\u001b[0m \u001b[031m0.91\u001b[0m\n",
      "街边 \u001b[033m10\u001b[0m \u001b[034m0.14\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "小到 \u001b[033m10\u001b[0m \u001b[034m0.26\u001b[0m \u001b[034m0.10\u001b[0m\n",
      "终于 \u001b[033m10\u001b[0m \u001b[031m0.55\u001b[0m \u001b[034m0.40\u001b[0m\n",
      "小得 \u001b[033m10\u001b[0m \u001b[034m0.14\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "是因为 \u001b[033m10\u001b[0m \u001b[034m0.26\u001b[0m \u001b[034m0.10\u001b[0m\n",
      "再来 \u001b[033m10\u001b[0m \u001b[031m0.65\u001b[0m \u001b[031m0.70\u001b[0m\n",
      "不红 \u001b[033m10\u001b[0m \u001b[034m0.41\u001b[0m \u001b[034m0.40\u001b[0m\n",
      "直采 \u001b[033m10\u001b[0m \u001b[031m0.55\u001b[0m \u001b[031m0.60\u001b[0m\n",
      "全烂 \u001b[033m10\u001b[0m \u001b[034m0.09\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "西柚 \u001b[033m10\u001b[0m \u001b[034m0.06\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "压伤 \u001b[033m10\u001b[0m \u001b[034m0.38\u001b[0m \u001b[034m0.30\u001b[0m\n",
      "寻真 \u001b[033m9\u001b[0m \u001b[034m0.31\u001b[0m \u001b[034m0.33\u001b[0m\n",
      "还坏 \u001b[033m9\u001b[0m \u001b[034m0.13\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "没人 \u001b[033m9\u001b[0m \u001b[034m0.15\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "为证 \u001b[033m9\u001b[0m \u001b[034m0.20\u001b[0m \u001b[034m0.11\u001b[0m\n",
      "如图 \u001b[033m9\u001b[0m \u001b[034m0.33\u001b[0m \u001b[034m0.33\u001b[0m\n",
      "手掌 \u001b[033m9\u001b[0m \u001b[034m0.17\u001b[0m \u001b[034m0.00\u001b[0m\n",
      "不酸 \u001b[033m9\u001b[0m \u001b[031m0.56\u001b[0m \u001b[031m0.67\u001b[0m\n",
      "啊啊啊 \u001b[033m9\u001b[0m \u001b[034m0.44\u001b[0m \u001b[034m0.33\u001b[0m\n",
      "京豆 \u001b[033m9\u001b[0m \u001b[034m0.30\u001b[0m \u001b[034m0.22\u001b[0m\n",
      "豆子 \u001b[033m9\u001b[0m \u001b[031m0.63\u001b[0m \u001b[031m1.00\u001b[0m\n",
      "这批 \u001b[033m8\u001b[0m \u001b[034m0.41\u001b[0m \u001b[034m0.38\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "y_pred = clf.predict_proba(X_test)\n",
    "c1, c2, c3 = Counter(), Counter(), Counter()\n",
    "for text, pred in zip(x_test, y_pred):\n",
    "    for word in cut(text):\n",
    "        if word not in words and match(word) and word not in stopwords:\n",
    "            c1[word] += 1\n",
    "            c2[word] += pred[1]\n",
    "            c3[word] += 0 if pred[0] > pred[1] else 1\n",
    "for word, freq in c1.most_common(100):\n",
    "    print(word, yellow(freq), hot(c2[word] / freq), hot(c3[word] / freq))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
